pkgs <- c("sf", "gstat", "mapview", "dplyr",
"rvest", "xml2",
"nomisr", "osmdata", "tidyr")
lapply(pkgs, require, character.only = TRUE)sessionInfo()## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.3.0 osmdata_0.2.0 nomisr_0.4.7 xml2_1.3.3 rvest_1.0.3
## [6] dplyr_1.1.0 mapview_2.11.0 gstat_2.0-9 sf_1.0-8
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.10 lattice_0.20-45 FNN_1.1.3.1 png_0.1-7
## [5] class_7.3-20 zoo_1.8-11 digest_0.6.29 utf8_1.2.2
## [9] plyr_1.8.7 R6_2.5.1 rsdmx_0.6 stats4_4.2.2
## [13] evaluate_0.20 e1071_1.7-11 httr_1.4.4 pillar_1.8.1
## [17] rlang_1.1.0 rstudioapi_0.14 raster_3.6-3 jquerylib_0.1.4
## [21] rmarkdown_2.20 webshot_0.5.4 htmlwidgets_1.5.4 munsell_0.5.0
## [25] proxy_0.4-27 compiler_4.2.2 xfun_0.37 pkgconfig_2.0.3
## [29] base64enc_0.1-3 htmltools_0.5.4 tidyselect_1.2.0 tibble_3.1.8
## [33] intervals_0.15.2 codetools_0.2-18 XML_3.99-0.11 fansi_1.0.3
## [37] spacetime_1.2-8 grid_4.2.2 jsonlite_1.8.4 satellite_1.0.4
## [41] lifecycle_1.0.3 DBI_1.1.3 magrittr_2.0.3 units_0.8-0
## [45] scales_1.2.1 KernSmooth_2.23-20 cli_3.4.1 cachem_1.0.7
## [49] leaflet_2.1.1 sp_1.5-0 snakecase_0.11.0 bslib_0.4.2
## [53] xts_0.12.1 generics_0.1.3 vctrs_0.6.0 tools_4.2.2
## [57] leafem_0.2.0 glue_1.6.2 purrr_1.0.1 crosstalk_1.2.0
## [61] fastmap_1.1.0 yaml_2.3.5 colorspace_2.0-3 terra_1.6-17
## [65] classInt_0.4-8 knitr_1.42 sass_0.4.5
# Create subdir for data if not exists
dn <- "_data"
ifelse(dir.exists(dn), "Exists", dir.create(dn))## [1] "Exists"
This example is inspired by Biegert, Kühhirt, and Van Lancker (2023).
Let’s try to get some football player stats from FBREF. On the home page, we see a list of trending player pages: https://fbref.com/en/ Let’s see if we can get this information into R.
Inspecting the head shots of players give is the following xpath:
//*[@id="players"]/div[3]
# Scrape the website
url <- "https://fbref.com/en/"
parsed <- read_html(url)
# Save xml (so we have the website locally)
write_xml(parsed, file = "_data/fbplayers.xml")# Read back in
parsed <- read_html("_data/fbplayers.xml")
# Select the desired element
parsed.sub <- html_elements(parsed, xpath = '//*[@id="players"]/div[3]')
trending.players <- html_elements(parsed.sub, "a")
trending.players## {xml_nodeset (20)}
## [1] <a href="/en/players/16380240/Mesut-Ozil">Mesut Özil</a>
## [2] <a href="/en/players/d70ce98e/Lionel-Messi">Lionel Messi</a>
## [3] <a href="/en/players/dea698d9/Cristiano-Ronaldo">Cristiano Ronaldo</a>
## [4] <a href="/en/players/1f44ac21/Erling-Haaland">Erling Haaland</a>
## [5] <a href="/en/players/6f806d99/Enzo-Le-Fee">Enzo Le Fée</a>
## [6] <a href="/en/players/ad82197c/Axel-Disasi">Axel Disasi</a>
## [7] <a href="/en/players/bc7dc64d/Bukayo-Saka">Bukayo Saka</a>
## [8] <a href="/en/players/35a6e5c7/Gabriel-Veiga">Gabriel Veiga</a>
## [9] <a href="/en/players/57d88cf9/Jude-Bellingham">Jude Bellingham</a>
## [10] <a href="/en/players/21a66f6a/Harry-Kane">Harry Kane</a>
## [11] <a href="/en/players/e46012d4/Kevin-De-Bruyne">Kevin De Bruyne</a>
## [12] <a href="/en/players/8c90fd7a/Victor-Osimhen">Victor Osimhen</a>
## [13] <a href="/en/players/69384e5d/Neymar">Neymar</a>
## [14] <a href="/en/players/dea88efd/Khvicha-Kvaratskhelia">Khvicha Kvaratskhel ...
## [15] <a href="/en/players/1c7012b8/Declan-Rice">Declan Rice</a>
## [16] <a href="/en/players/1bacc518/Frenkie-de-Jong">Frenkie de Jong</a>
## [17] <a href="/en/players/507c7bdf/Bruno-Fernandes">Bruno Fernandes</a>
## [18] <a href="/en/players/19cae58d/Gavi">Gavi</a>
## [19] <a href="/en/players/31822f8c/Folarin-Balogun">Folarin Balogun</a>
## [20] <a href="/en/players/42fd9c7f/Kylian-Mbappe">Kylian Mbappé</a>
We have extracted the set of players with the respective links. Now,
we separate these into the names with html_text2() and the
links using html_attrs().
# Data frame with names and links of players
trending.players.df <- data.frame(names = html_text2(trending.players),
links = unlist(html_attrs(trending.players)))
# Extend the links
trending.players.df$links <- paste0("https://fbref.com/", trending.players.df$links)
head(trending.players.df)## names links
## 1 Mesut Özil https://fbref.com//en/players/16380240/Mesut-Ozil
## 2 Lionel Messi https://fbref.com//en/players/d70ce98e/Lionel-Messi
## 3 Cristiano Ronaldo https://fbref.com//en/players/dea698d9/Cristiano-Ronaldo
## 4 Erling Haaland https://fbref.com//en/players/1f44ac21/Erling-Haaland
## 5 Enzo Le Fée https://fbref.com//en/players/6f806d99/Enzo-Le-Fee
## 6 Axel Disasi https://fbref.com//en/players/ad82197c/Axel-Disasi
Can you get some player characteristics for Lionel Messi?
Can you wrap this into a loop for multiple players? USE
sys.sleep() TO PAUSE!
Now we have a time-series object of the most trending player’s seasonal performances.
head(player_stats.df)This is from a short course Introduction to geospatial data and analysis
Lets get some administrative boundaries for London from the London Datastore.
# Download zip file and unzip
tmpf <- tempfile()
boundary.link <- "https://data.london.gov.uk/download/statistical-gis-boundary-files-london/9ba8c833-6370-4b11-abdc-314aa020d5e0/statistical-gis-boundaries-london.zip"
download.file(boundary.link, tmpf)
unzip(zipfile = tmpf, exdir = paste0(dn))
unlink(tmpf)
# This is a shapefile
# We only need the MSOA layer for now
msoa.spdf <- st_read(dsn = paste0(dn, "/statistical-gis-boundaries-london/ESRI"),
layer = "MSOA_2011_London_gen_MHW" # Note: no file ending
)## Reading layer `MSOA_2011_London_gen_MHW' from data source
## `C:\work\Lehre\Web_Scraping\02_Exercise\_data\statistical-gis-boundaries-london\ESRI'
## using driver `ESRI Shapefile'
## Simple feature collection with 983 features and 12 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 503574.2 ymin: 155850.8 xmax: 561956.7 ymax: 200933.6
## Projected CRS: OSGB 1936 / British National Grid
head(msoa.spdf)## Simple feature collection with 6 features and 12 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 530966.7 ymin: 180510.7 xmax: 551943.8 ymax: 191139
## Projected CRS: OSGB 1936 / British National Grid
## MSOA11CD MSOA11NM LAD11CD LAD11NM RGN11CD
## 1 E02000001 City of London 001 E09000001 City of London E12000007
## 2 E02000002 Barking and Dagenham 001 E09000002 Barking and Dagenham E12000007
## 3 E02000003 Barking and Dagenham 002 E09000002 Barking and Dagenham E12000007
## 4 E02000004 Barking and Dagenham 003 E09000002 Barking and Dagenham E12000007
## 5 E02000005 Barking and Dagenham 004 E09000002 Barking and Dagenham E12000007
## 6 E02000007 Barking and Dagenham 006 E09000002 Barking and Dagenham E12000007
## RGN11NM USUALRES HHOLDRES COMESTRES POPDEN HHOLDS AVHHOLDSZ
## 1 London 7375 7187 188 25.5 4385 1.6
## 2 London 6775 6724 51 31.3 2713 2.5
## 3 London 10045 10033 12 46.9 3834 2.6
## 4 London 6182 5937 245 24.8 2318 2.6
## 5 London 8562 8562 0 72.1 3183 2.7
## 6 London 8791 8672 119 50.6 3441 2.5
## geometry
## 1 MULTIPOLYGON (((531667.6 18...
## 2 MULTIPOLYGON (((548881.6 19...
## 3 MULTIPOLYGON (((549102.4 18...
## 4 MULTIPOLYGON (((551550 1873...
## 5 MULTIPOLYGON (((549099.6 18...
## 6 MULTIPOLYGON (((549819.9 18...
This looks like a conventional data.frame but has the
additional column geometry containing the coordinates of
each observation. st_geometry() returns only the geographic
object and st_drop_geometry() only the
data.frame without the coordinates. We can plot the object
using mapview().
mapview(msoa.spdf[, "POPDEN"])Now that we have some boundaries and shapes of spatial units in London, we can start looking for different data sources to populate the geometries.
A good source for demographic data is for instance the 2011 census. Below we use the nomis API to retrieve population data for London, See the Vignette for more information (Guest users are limited to 25,000 rows per query). Below is a wrapper to avoid some mess with sex and urban-rural cross-tabs.
### For larger request, register and set key
# Sys.setenv(NOMIS_API_KEY = "XXX")
# nomis_api_key(check_env = TRUE)
# Get London ids
london_ids <- msoa.spdf$MSOA11CD
### Get key statistics ids
# select required tables (https://www.nomisweb.co.uk/sources/census_2011_ks)
# Let's get KS201EW (ethnic group) and KS402EW (housing tenure)From the Nomis website https://www.nomisweb.co.uk/sources/census_2011_ks, we can get the external identifiers that we want to pull: KS201EW (ethnic group) and KS402EW (housing tenure). Unfortunately, internal keys differ (for what reason ever…).
# Get table of IDs and description
x <- nomis_data_info()
head(x)## # A tibble: 6 × 14
## agencyid id uri version annot…¹ compo…² compo…³ compo…⁴ compo…⁵ compo…⁶
## <chr> <chr> <chr> <dbl> <list> <list> <list> <chr> <chr> <chr>
## 1 NOMIS NM_1_1 Nm-1d1 1 <df> <df> <df> OBS_VA… CL_1_1… TIME
## 2 NOMIS NM_2_1 Nm-2d1 1 <df> <df> <df> OBS_VA… CL_2_1… TIME
## 3 NOMIS NM_4_1 Nm-4d1 1 <df> <df> <df> OBS_VA… CL_4_1… TIME
## 4 NOMIS NM_5_1 Nm-5d1 1 <df> <df> <df> OBS_VA… CL_5_1… TIME
## 5 NOMIS NM_6_1 Nm-6d1 1 <df> <df> <df> OBS_VA… CL_6_1… TIME
## 6 NOMIS NM_7_1 Nm-7d1 1 <df> <df> <df> OBS_VA… CL_7_1… TIME
## # … with 4 more variables: description.value <chr>, description.lang <chr>,
## # name.value <chr>, name.lang <chr>, and abbreviated variable names
## # ¹annotations.annotation, ²components.attribute, ³components.dimension,
## # ⁴components.primarymeasure.conceptref, ⁵components.timedimension.codelist,
## # ⁶components.timedimension.conceptref
So we have to find the internal ids of KS201EW and KS402EW.
# Get internal ids
stats <- c("KS201EW", "KS402EW")
oo <- which(grepl(paste(stats, collapse = "|"), x$name.value))
ksids <- x$id[oo]
ksids # This are the internal ids## [1] "NM_608_1" "NM_619_1"
Finally, we can start pulling the information from the Nomis census APi! Below is a wrapper to avoid some mess with sex and urban-rural cross-tabs. Some characteristics can be pulled separately for urban-rural or by sex - In that case we have to specify it, but we cannot specify the option if a variable does not have urban-rural or sex differentials. The wrapper below also creates a codebook with variable descriptions.
### look at meta information
q <- nomis_overview(ksids[1])
head(q)## # A tibble: 6 × 2
## name value
## <chr> <list>
## 1 analyses <named list [1]>
## 2 analysisname <chr [1]>
## 3 analysisnumber <int [1]>
## 4 contact <named list [4]>
## 5 contenttypes <named list [1]>
## 6 coverage <chr [1]>
a <- nomis_get_metadata(id = ksids[1], concept = "GEOGRAPHY", type = "type")
a # TYPE297 is MSOA level## # A tibble: 24 × 3
## id label.en description.en
## <chr> <chr> <chr>
## 1 TYPE265 NHS area teams NHS area teams
## 2 TYPE266 clinical commissioning groups clinical commissi…
## 3 TYPE267 built-up areas including subdivisions built-up areas in…
## 4 TYPE269 built-up areas built-up areas
## 5 TYPE273 national assembly for wales electoral regions 2010 national assembly…
## 6 TYPE274 postcode areas postcode areas
## 7 TYPE275 postcode districts postcode districts
## 8 TYPE276 postcode sectors postcode sectors
## 9 TYPE277 national assembly for wales constituencies 2010 national assembly…
## 10 TYPE279 parishes 2011 parishes 2011
## # … with 14 more rows
b <- nomis_get_metadata(id = ksids[1], concept = "MEASURES", type = "TYPE297")
b # 20100 is the measure of absolute numbers## # A tibble: 2 × 3
## id label.en description.en
## <chr> <chr> <chr>
## 1 20100 value value
## 2 20301 percent percent
### Query data in loop over the required statistics
for(i in ksids){
# Determin if data is divided by sex or urban-rural
nd <- nomis_get_metadata(id = i)
if("RURAL_URBAN" %in% nd$conceptref){
UR <- TRUE
}else{
UR <- FALSE
}
if("C_SEX" %in% nd$conceptref){
SEX <- TRUE
}else{
SEX <- FALSE
}
# make data request
if(UR == TRUE){
if(SEX == TRUE){
tmp_en <- nomis_get_data(id = i, time = "2011",
geography = london_ids, # replace with "TYPE297" for all MSOAs
measures = 20100, RURAL_URBAN = 0, C_SEX = 0)
}else{
tmp_en <- nomis_get_data(id = i, time = "2011",
geography = london_ids, # replace with "TYPE297" for all MSOAs
measures = 20100, RURAL_URBAN = 0)
}
}else{
if(SEX == TRUE){
tmp_en <- nomis_get_data(id = i, time = "2011",
geography = london_ids, # replace with "TYPE297" for all MSOAs
measures = 20100, C_SEX = 0)
}else{
tmp_en <- nomis_get_data(id = i, time = "2011",
geography = london_ids, # replace with "TYPE297" for all MSOAs
measures = 20100)
}
}
# Append (in case of different regions)
ks_tmp <- tmp_en
# Make lower case names
names(ks_tmp) <- tolower(names(ks_tmp))
names(ks_tmp)[names(ks_tmp) == "geography_code"] <- "msoa11"
names(ks_tmp)[names(ks_tmp) == "geography_name"] <- "name"
# replace weird cell codes
onlynum <- which(grepl("^[[:digit:]]+$", ks_tmp$cell_code))
if(length(onlynum) != 0){
code <- substr(ks_tmp$cell_code[-onlynum][1], 1, 7)
if(is.na(code)){
code <- i
}
ks_tmp$cell_code[onlynum] <- paste0(code, "_", ks_tmp$cell_code[onlynum])
}
# save codebook
ks_cb <- unique(ks_tmp[, c("date", "cell_type", "cell", "cell_code", "cell_name")])
### Reshape
ks_res <- tidyr::pivot_wider(ks_tmp, id_cols = c("msoa11", "name"),
names_from = "cell_code",
values_from = "obs_value")
### Merge
if(i == ksids[1]){
census_keystat.df <- ks_res
census_keystat_cb.df <- ks_cb
}else{
census_keystat.df <- merge(census_keystat.df, ks_res, by = c("msoa11", "name"), all = TRUE)
census_keystat_cb.df <- rbind(census_keystat_cb.df, ks_cb)
}
}
# Descirption are saved in the codebook
head(census_keystat_cb.df)## # A tibble: 6 × 5
## date cell_type cell cell_code cell_name
## <dbl> <chr> <dbl> <chr> <chr>
## 1 2011 Ethnic Group 0 KS201EW0001 All usual residents
## 2 2011 Ethnic Group 100 KS201EW_100 White
## 3 2011 Ethnic Group 1 KS201EW0002 White: English/Welsh/Scottish/Northern I…
## 4 2011 Ethnic Group 2 KS201EW0003 White: Irish
## 5 2011 Ethnic Group 3 KS201EW0004 White: Gypsy or Irish Traveller
## 6 2011 Ethnic Group 4 KS201EW0005 White: Other White
### Merge with MSOA geometries above
msoa.spdf <- merge(msoa.spdf, census_keystat.df,
by.x = "MSOA11CD", by.y = "msoa11", all.x = TRUE)Now we can, for instance, plot the spatial distribution of ethnic groups.
msoa.spdf$per_white <- msoa.spdf$KS201EW_100 / msoa.spdf$KS201EW0001 * 100
mapview(msoa.spdf[, "per_white"])